# Calculating emission rates for each pollutant
library(pacman)
p_load(
  here, data.table, fst, ggplot2, readxl, magrittr, 
  fixest, broom, dplyr, purrr, stringr, janitor
)

# Unit information table 
unit_info_dt = read.fst(
  here("Data/electricity-generation/unit-info-dt.fst"),
  as.data.table = TRUE
)

# Hourly emissions data
elec_gen_dt=
  read.fst(
    path = here("Data/electricity-generation/unit-model-fit/elec-gen-fit-dt.fst"),
    as.data.table = TRUE
  )

# Reading in egrid PM2.5 emissions rates 
egrid_unit_pm_dt = 
  read_xlsx(
    path = here('Data/electricity-generation/egrid/egrid_draft_pm2.5_emissions_7-20-20.xlsx'),
    sheet = '2018 PM Unit-level Data',
    skip = 1
  ) |> 
  data.table() %>% 
  # Two plants have duplicates because of multiple prime mover types
  # Taking the max of the two
  .[PRMVR != 'HY',
    # PM25RT is in lbs/mmbtu, converting to tons
    .(pm_tons_per_mmbtu = max(PM25RT/2000)),
    keyby = .(orispl_code = ORISPL, unitid = UNITID)
  ] 

# Calculating annual emissions for each unit 
emissions_dt_raw = 
  elec_gen_dt[,.(
    tot_nox_lbs = sum(nox_mass_lbs, na.rm = TRUE),
    tot_so2_lbs = sum(so2_mass_lbs, na.rm = TRUE),
    tot_co2_tons = sum(co2_mass_tons, na.rm = TRUE),
    tot_gload_mwh = sum(gload_mwh, na.rm = TRUE),
    tot_heat_input_mmbtu = sum(heat_input_mm_btu, na.rm = TRUE)), 
    keyby = .(orispl_code, unitid, year = year(utc_time))
  ]

# Calculating emissions rates for each pollutant
emissions_dt = 
  merge(
    emissions_dt_raw, 
    egrid_unit_pm_dt,
    by = c('orispl_code', 'unitid'),
    all.x = TRUE
  )[,.(
    orispl_code, unitid, year, 
    nox_lbs_per_mw = tot_nox_lbs/tot_gload_mwh, 
    so2_lbs_per_mw = tot_so2_lbs/tot_gload_mwh, 
    co2_tons_per_mw = tot_co2_tons/tot_gload_mwh, 
    pm25_tons_per_mw = pm_tons_per_mmbtu*tot_heat_input_mmbtu/tot_gload_mwh,
    tot_gload_mwh
  )]
# Currently have PM2.5 emissions rates for 94% of electricity generation
# (100% of all other pollutants)
# emissions_dt[,sum(ifelse(is.na(pm25_tons_per_mw), tot_gload_mwh, 0))/sum(tot_gload_mwh)]

# Saving the results 
fwrite(
  emissions_dt,
  file = here('Data/electricity-generation/output/emissions-rate-avg.csv')
)

# Fitting model with different emissions rates each tercile
# For Testing on a single unit:
   orispl_code_in = 3548 # SAVE THIS: 10308L
   unitid_in = 'GT-2A' # SAVE THIS: '1002'

emissions_rates_median = function(orispl_code_in, unitid_in, split = 0.5, print = FALSE){
  if(print == TRUE) {print(paste(orispl_code_in, unitid_in))}
  # Filtering to single unit, removing hours with no electricity but positive emissions
  unit_dt = elec_gen_dt[.(orispl_code_in, unitid_in)][
    !(gload_mwh == 0 & (nox_mass_lbs > 0|so2_mass_lbs > 0|co2_mass_tons > 0)) 
    ]
  # Finding quantile of generation
  unit_quants = quantile(unit_dt[gload_mwh>0]$gload_mwh, probs = split)
  unit_min = min(unit_dt$gload_mwh)
  # Calculating outliers 
  #extreme_dt = unit_dt[gload_mwh > 0,.(
  #  so2_lbs_per_mwh = mean(so2_mass_lbs/gload_mwh, na.rm = TRUE) 
  #                    + 6*sd(so2_mass_lbs/gload_mwh, na.rm = TRUE),
  #  nox_lbs_per_mwh = mean(nox_mass_lbs/gload_mwh, na.rm = TRUE) 
  #                    + 6*sd(nox_mass_lbs/gload_mwh, na.rm = TRUE),
  #  co2_tons_per_mwh = mean(co2_mass_tons/gload_mwh, na.rm = TRUE) 
  #                    + 6*sd(co2_mass_tons/gload_mwh, na.rm = TRUE)
  #)]
  # Adding indicators 
  unit_dt[, ':='(
    #not_so2_outlier = so2_mass_lbs/gload_mwh <= extreme_dt$so2_lbs_per_mwh | gload_mwh == 0,
    #not_nox_outlier = nox_mass_lbs/gload_mwh <= extreme_dt$nox_lbs_per_mwh | gload_mwh == 0,
    #not_co2_outlier = co2_mass_tons/gload_mwh <= extreme_dt$co2_tons_per_mwh | gload_mwh == 0,
    gload_over_split = gload_mwh >= unit_quants[1] & gload_mwh > unit_min,
    gload_msplit = gload_mwh - unit_quants[1]
  )]
  # Regression with split
  median_mod = feols(
    data = unit_dt,#[not_so2_outlier&not_nox_outlier&not_co2_outlier], 
    fml = c(nox_mass_lbs, so2_mass_lbs, co2_mass_tons) ~ 
      -1 + gload_mwh + i(gload_over_split, gload_msplit, ref = TRUE)
  )
  # NOX: Re-estimating if marginal rate is negative
  if(length(coef(median_mod[lhs = 'nox'])) == 2){
    if(
      (coef(median_mod[lhs = 'nox'], keep = 'gload_mwh') + 
       coef(median_mod[lhs = 'nox'], keep = 'gload_over_split') < 0
      )|coef(median_mod[lhs = 'nox'], keep = 'gload_mwh') < 0
    ){# Fitting model without median indicator
      rate_mod_nox = feols(
        data = unit_dt, 
        fml = nox_mass_lbs ~ -1 + gload_mwh 
      )
      # Collecting the results 
      median_out_nox = 
        rate_mod_nox |> 
        tidy() |> 
        mutate(
          pollutant = 'nox',
          orispl_code = orispl_code_in,
          unitid = unitid_in,
          split = split,
          gload_mwh_split = unit_quants[1]
        )
    }else{
      median_out_nox = 
        median_mod[lhs = 'nox'] |> 
        tidy() |> 
        mutate(
          pollutant = 'nox',
          orispl_code = orispl_code_in,
          unitid = unitid_in,
          split = split,
          gload_mwh_split = unit_quants[1]
        )
    }
  } else if(length(coef(median_mod[lhs = 'nox'])) == 1) {
    median_out_nox = 
      median_mod[lhs = 'nox'] |> 
      tidy() |> 
      mutate(
        pollutant = 'nox',
        orispl_code = orispl_code_in,
        unitid = unitid_in,
        split = split,
        gload_mwh_split = unit_quants[1]
      )
  } else {
    median_out_nox = tibble(
      term = NA,
      estimate = NA, 
      std.error = NA,
      statistic = NA, 
      p.value = NA,
      pollutant = 'nox',
      orispl_code = orispl_code_in,
      unitid = unitid_in,
      split = split,
      gload_mwh_split = unit_quants[1]
    )
  }
  # SO2: Re-estimating if marginal rate is negative
  if(length(coef(median_mod[lhs = 'so2'])) == 2){
    if(
      (coef(median_mod[lhs = 'so2'], keep = 'gload_mwh') + 
       coef(median_mod[lhs = 'so2'], keep = 'gload_over_split') < 0
      )|coef(median_mod[lhs = 'so2'], keep = 'gload_mwh') < 0
    ){# Fitting model without median indicator
      rate_mod_so2 = feols(
        data = unit_dt, 
        fml = so2_mass_lbs ~ -1 + gload_mwh 
      )
      # Collecting the results 
      median_out_so2 = 
        rate_mod_so2 |> 
        tidy() |> 
        mutate(
          pollutant = 'so2',
          orispl_code = orispl_code_in,
          unitid = unitid_in,
          split = split,
          gload_mwh_split = unit_quants[1]
        )
    }else{
      median_out_so2 = 
        median_mod[lhs = 'so2'] |> 
        tidy() |> 
        mutate(
          pollutant = 'so2',
          orispl_code = orispl_code_in,
          unitid = unitid_in,
          split = split,
          gload_mwh_split = unit_quants[1]
        )
    }
  } else if(length(coef(median_mod[lhs = 'so2'])) == 1) {
    median_out_so2 = 
      median_mod[lhs = 'so2'] |> 
      tidy() |> 
      mutate(
        pollutant = 'so2',
        orispl_code = orispl_code_in,
        unitid = unitid_in,
        split = split,
        gload_mwh_split = unit_quants[1]
      )
  } else {
    median_out_so2 = tibble(
      term = NA,
      estimate = NA, 
      std.error = NA,
      statistic = NA, 
      p.value = NA,
      pollutant = 'so2',
      orispl_code = orispl_code_in,
      unitid = unitid_in,
      split = split,
      gload_mwh_split = unit_quants[1]
    )
  }
  # CO2: Re-estimating if marginal rate is negative
  if(length(coef(median_mod[lhs = 'co2'])) == 2){
    if(
      (coef(median_mod[lhs = 'co2'], keep = 'gload_mwh') + 
       coef(median_mod[lhs = 'co2'], keep = 'gload_over_split') < 0
      ) | coef(median_mod[lhs = 'co2'], keep = 'gload_mwh') < 0
    ){# Fitting model without median indicator
      rate_mod_co2 = feols(
        data = unit_dt, 
        fml = co2_mass_tons ~ -1 + gload_mwh 
      )
      # Collecting the results 
      median_out_co2 = 
        rate_mod_co2 |> 
        tidy() |> 
        mutate(
          pollutant = 'co2',
          orispl_code = orispl_code_in,
          unitid = unitid_in,
          split = split,
          gload_mwh_split = unit_quants[1]
        )
    }else{
      median_out_co2 = 
        median_mod[lhs = 'co2'] |> 
        tidy() |> 
        mutate(
          pollutant = 'co2',
          orispl_code = orispl_code_in,
          unitid = unitid_in,
          split = split,
          gload_mwh_split = unit_quants[1]
        )
    }
  } else if(length(coef(median_mod[lhs = 'co2'])) == 1) {
    median_out_co2 = 
      median_mod[lhs = 'co2'] |> 
      tidy() |> 
      mutate(
        pollutant = 'co2',
        orispl_code = orispl_code_in,
        unitid = unitid_in,
        split = split,
        gload_mwh_split = unit_quants[1]
      )
  }else{
    median_out_co2 = tibble(
      term = NA,
      estimate = NA, 
      std.error = NA,
      statistic = NA, 
      p.value = NA,
      pollutant = 'co2',
      orispl_code = orispl_code_in,
      unitid = unitid_in,
      split = split,
      gload_mwh_split = unit_quants[1]
    )
  }
  # Returning the results 
  return(
    rbind(median_out_nox, median_out_so2, median_out_co2) %>% as.data.table() 
  )
}  

# Running models for all units 
emissions_rate_dt = 
  pmap_dfr(
    elec_gen_dt[,.(orispl_code, unitid)] |> unique(),
    emissions_rates_median,
    print = TRUE
  )

 # Cleaning up the results
clean_emissions_rate_dt = 
  rbind(
    emissions_rate_dt[
      term == 'gload_mwh',.(
        orispl_code, unitid, pollutant, 
        output = 'emissions_rate_low',
        gload_mwh_split,
        emissions_rate = estimate
      )],
    emissions_rate_dt[!is.na(term)] %>% 
      dcast(
        orispl_code + unitid + pollutant + gload_mwh_split ~ term, 
        value.var = 'estimate', 
        fill= 0
      ) %>% 
      .[,.(
        orispl_code, unitid, pollutant, 
        output = 'emissions_rate_high',
        gload_mwh_split,
        emissions_rate = gload_mwh + `gload_over_split::TRUE:gload_msplit`
      )]
  ) 
  #dcast(
  #  orispl_code + unitid + gload_mwh_split ~ output + pollutant,
  #  value.var = 'emissions_rate'
  #) 

# Saving results 
write.fst(
  emissions_rate_dt,
  here('Data/electricity-generation/unit-model-fit/emissions-rate-dt-raw.fst')
)
write.fst(
  clean_emissions_rate_dt,
  here('Data/electricity-generation/unit-model-fit/emissions-rate-dt.fst')
)
write.csv(
  clean_emissions_rate_dt,
  here('Data/electricity-generation/output/emissions-rate-median.csv')
)

# ---------------------------------------------------------------------------------------
# ---------------------------------------------------------------------------------------
# Below here is old code testing different versions of the above model ------------------
# ---------------------------------------------------------------------------------------
# ---------------------------------------------------------------------------------------

test_emissions_rate = function(orispl_code_in, unitid_in, size = 0.1){
  print(paste(orispl_code_in, unitid_in))
  # Filtering to single unit
  test_dt = elec_gen_dt[orispl_code == orispl_code_in & unitid == unitid_in]
  
  # Finding quantiles of generation
  unit_quants = quantile(
    test_dt$gload_mw_tot, 
    probs = c(seq(0,1,by = size), 0.75, 1/3, 2/3), 
    na.rm = TRUE
  )
  
  # Adding indicators 
  test_dt[, ':='(
    # Indicator for over 75 to test whether we want to use single rate
    gload_over_p75 = gload_mw_tot >= unit_quants["75%"] & 
                      gload_mw_tot > unit_quants["0%"],
    # Indicators for terciles 
    gload_over_p33 = gload_mw_tot >= unit_quants['33.33333%'] & 
                      gload_mw_tot > unit_quants["0%"],
    gload_over_p50 = gload_mw_tot >= unit_quants['50%'] & 
                      gload_mw_tot > unit_quants["0%"],
    gload_over_p66 = gload_mw_tot >= unit_quants['66.66667%'] & 
                      gload_mw_tot > unit_quants['33.33333%'],
    # Subtracting tercile quantities
    gload_mw_m33 = gload_mw_tot - unit_quants['33.33333%'],
    gload_mw_m50 = gload_mw_tot - unit_quants['50%'],
    gload_mw_m66 = gload_mw_tot - unit_quants['66.66667%']
  )]
  
  
  # Running regression testing significance of over 75th interaction
  test_mod = feols(
    data = test_dt, 
    fml = c(nox_mass_lbs, so2_mass_lbs, co2_mass_tons) ~ 
      -1 + gload_mw_tot + i(gload_over_p75,gload_mw_tot, ref = TRUE)
  )
  
  # Running the regression with terciles 
  if(unit_quants['33.33333%'] == unit_quants['66.66667%']){
    tercile_fml = 'c(nox_mass_lbs, so2_mass_lbs, co2_mass_tons) ~ 
      -1 + gload_mw_tot + 
      i(gload_over_p33,gload_mw_m33, ref = TRUE) ' |> as.formula()
    tercile_int_fml = 'c(nox_mass_lbs, so2_mass_lbs, co2_mass_tons) ~ 
      gload_mw_tot + 
      i(gload_over_p33,gload_mw_m33, ref = TRUE) ' |> as.formula()
  }else{
    tercile_fml = 'c(nox_mass_lbs, so2_mass_lbs, co2_mass_tons) ~ 
      -1 + gload_mw_tot + 
      i(gload_over_p33,gload_mw_m33, ref = TRUE) +
      i(gload_over_p66,gload_mw_m66, ref = TRUE) ' |> as.formula()
    tercile_int_fml = 'c(nox_mass_lbs, so2_mass_lbs, co2_mass_tons) ~ 
      gload_mw_tot + 
      i(gload_over_p33,gload_mw_m33, ref = TRUE) +
      i(gload_over_p66,gload_mw_m66, ref = TRUE) ' |> as.formula()
  }
  
  tercile_mod = feols(
    data = test_dt, 
    fml = tercile_fml
  )
  tercile_int_mod = feols(
    data = test_dt, 
    fml = tercile_int_fml
  )
  
  #p_load(splines)
  #ggplot(
  #  test_dt[,.(
  #    gload_mw_tot, 
  #    nox_mass_lbs,
  #    pred_nox_mass_lbs = tercile_mod[[1]]$fitted.values,
  #    pred_nox_mass_lbs_int = tercile_int_mod[[1]]$fitted.values
  #  )]
  #) + 
  #  geom_point(aes(x= gload_mw_tot, y = nox_mass_lbs), color = 'black')+ 
  #  geom_line(aes(x= gload_mw_tot, y = pred_nox_mass_lbs), color = 'red') + 
  #  geom_line(aes(x= gload_mw_tot, y = pred_nox_mass_lbs_int), color = 'green') + 
  #  geom_vline(xintercept = unit_quants[1], linetype = 'dashed')+ 
  #  geom_vline(xintercept = unit_quants[1], linetype = 'dashed') + 
  #  labs(x = "Electricity Production (MW)", y = "NOx Emissions (lbs)") +
  #  theme_minimal()
  #  
  #ggplot(test_dt, aes(x = gload_mw_tot, y = nox_mass_lbs/gload_mw_tot)) + 
  #  geom_point() + 
  #  geom_smooth(method = 'lm', se = FALSE)+ 
  #  geom_smooth(method = 'lm',color ='red', formula = y ~ x + I(x^2), se = FALSE) + 
  #  labs(x = "Electricity Production (MW)", y = "NOx Emissions per MW (lbs/MW)") +
  #  theme_minimal()
    
  test_out = 
    rbind(
      test_mod[lhs = 'nox'] |> 
        tidy() |> 
        mutate(
          pollutant = 'nox',
          orispl_code = orispl_code_in,
          unitid = unitid_in
        ),
      test_mod[lhs = 'so2'] |> 
        tidy() |> 
        mutate(
          pollutant = 'so2',
          orispl_code = orispl_code_in,
          unitid = unitid_in
        ),
      test_mod[lhs = 'co2'] |> 
        tidy() |> 
        mutate(
          pollutant = 'co2',
          orispl_code = orispl_code_in,
          unitid = unitid_in
        )
    ) |> data.table()
  
  tercile_out = 
    rbind(
      tercile_mod[lhs = 'nox'] |> 
      tidy() |> 
      mutate(
        pollutant = 'nox',
        orispl_code = orispl_code_in,
        unitid = unitid_in,
        gload_mw_p33 = unit_quants['33.33333%'],
        gload_mw_p66 = unit_quants['66.66667%']
      ),
    tercile_mod[lhs = 'so2'] |> 
      tidy() |> 
      mutate(
        pollutant = 'so2',
        orispl_code = orispl_code_in,
        unitid = unitid_in,
        gload_mw_p33 = unit_quants['33.33333%'],
        gload_mw_p66 = unit_quants['66.66667%']
      ),
    tercile_mod[lhs = 'co2'] |> 
      tidy() |> 
      mutate(
        pollutant = 'co2',
        orispl_code = orispl_code_in,
        unitid = unitid_in,
        gload_mw_p33 = unit_quants['33.33333%'],
        gload_mw_p66 = unit_quants['66.66667%']
      )
    ) |> data.table()
  
  rate_out = 
    rbind(
      rate_mod[lhs = 'nox'][[1]] |>
      tidy()|> 
      mutate(
        pollutant = 'nox',
        degree = 'linear',
        orispl_code = orispl_code_in,
        unitid = unitid_in
      ),
      rate_mod[lhs = 'nox'][[2]] |>
        tidy()|> 
        mutate(
          pollutant = 'nox',
          degree = 'quadratic',
          orispl_code = orispl_code_in,
          unitid = unitid_in
        ),
      rate_mod[lhs = 'so2'][[1]] |>
        tidy()|> 
        mutate(
          pollutant = 'so2',
          degree = 'linear',
          orispl_code = orispl_code_in,
          unitid = unitid_in
        ),
      rate_mod[lhs = 'so2'][[2]] |>
        tidy()|> 
        mutate(
          pollutant = 'so2',
          degree = 'quadratic',
          orispl_code = orispl_code_in,
          unitid = unitid_in
        ),
      rate_mod[lhs = 'co2'][[1]] |>
        tidy()|> 
        mutate(
          pollutant = 'co2',
          degree = 'linear',
          orispl_code = orispl_code_in,
          unitid = unitid_in
        ),
      rate_mod[lhs = 'co2'][[2]] |>
        tidy()|> 
        mutate(
          pollutant = 'co2',
          degree = 'quadratic',
          orispl_code = orispl_code_in,
          unitid = unitid_in
        )
    ) |> data.table()
  
  tercile_int_out = 
    rbind(
      tercile_int_mod[lhs = 'nox'] |> 
        tidy() |> 
        mutate(
          pollutant = 'nox',
          orispl_code = orispl_code_in,
          unitid = unitid_in,
          gload_mw_p33 = unit_quants['33.33333%'],
          gload_mw_p66 = unit_quants['66.66667%']
        ),
      tercile_int_mod[lhs = 'so2'] |> 
        tidy() |> 
        mutate(
          pollutant = 'so2',
          orispl_code = orispl_code_in,
          unitid = unitid_in,
          gload_mw_p33 = unit_quants['33.33333%'],
          gload_mw_p66 = unit_quants['66.66667%']
        ),
      tercile_int_mod[lhs = 'co2'] |> 
        tidy() |> 
        mutate(
          pollutant = 'co2',
          orispl_code = orispl_code_in,
          unitid = unitid_in,
          gload_mw_p33 = unit_quants['33.33333%'],
          gload_mw_p66 = unit_quants['66.66667%']
        )
    ) |> data.table()
  
  all_out = rbind(
    test_out |> mutate(gload_mw_p33 = NA, gload_mw_p66 = NA, degree = NA, model = 'test'),
    tercile_out |> mutate(model = 'tercile', degree = NA),
    tercile_int_out |> mutate(model = 'tercile intercept', degree = NA),
    rate_out |> mutate(gload_mw_p33 = NA, gload_mw_p66 = NA, model = 'rate'),
    use.names = TRUE
  )
  
  return(all_out)
}

# Running models for all units 
test_emissions_rate_dt = 
  pmap_dfr(
    elec_gen_dt[,.(orispl_code, unitid)] |> unique(),
    test_emissions_rate
  )

write.fst(
  test_emissions_rate_dt,
  here('Data/electricity-generation/unit-model-fit/test-emissions-rate-dt.fst')
)


# Reading in the previously run data
test_emissions_rate_dt =
  read.fst(
    here('Data/electricity-generation/unit-model-fit/test-emissions-rate-dt.fst'),
    as.data.table = TRUE
  )

theme_set(theme_minimal())

# Plotting the distribution of p-values
ggplot(test_emissions_rate_dt[model == 'test'], aes(x = p.value)) +
  geom_histogram(binwidth = 0.01)

# Seems like a bad idea to do a constant emissions rate
test_emissions_rate_dt[
  model == 'test' & 
  !is.na(p.value) &
  term == 'gload_over_p75::TRUE:gload_mw_tot',.(
    pct_significant_01 = mean(p.value < 0.01),
    pct_significant_05 = mean(p.value < 0.05),
    pct_significant_10 = mean(p.value < 0.10)), 
  keyby = .(pollutant)
]

# How many coefficients from tercile model are negative? 
  test_emissions_rate_dt[
    str_detect(model,'tercile'),
    .(model,orispl_code, unitid, pollutant, term, estimate)
  ] |>
  dcast(
    model + orispl_code + unitid + pollutant ~ term, 
    value.var = 'estimate'
  ) |>
  setnames(
    old = c('gload_mw_tot', 'gload_over_p33::TRUE:gload_mw_m33','gload_over_p66::TRUE:gload_mw_m66'),
    new = c('coef', 'coef_33','coef_66')
  ) %>% 
    .[,.(
      model, orispl_code, unitid, pollutant, 
      emissions_rate_00_33 = coef, 
      emissions_rate_33_66 = coef + coef_33, 
      emissions_rate_66_99 = coef + coef_33 + coef_66
    )] %>% 
    .[,.(
      pct_positive_00_33 = mean(emissions_rate_00_33 > 0, na.rm = TRUE),
      pct_positive_33_66 = mean(emissions_rate_33_66 > 0, na.rm = TRUE),
      pct_positive_66_99 = mean(emissions_rate_66_99 > 0, na.rm = TRUE),
      pct_positive_all = mean(emissions_rate_00_33 > 0 & emissions_rate_33_66 > 0 & emissions_rate_66_99 >0, na.rm = TRUE)),
      keyby = .(model,pollutant)
    ]

# Only 17% of 2nd terciles are positive, 70% of third
test_emissions_rate_dt[
  model =='tercile',
  .(tot_neg = sum(estimate < 0, na.rm =TRUE)),
  keyby = .(orispl_code, unitid, pollutant)
][,.(mean(tot_neg > 0))]



# How do the rate models look?
linear_rate_dt = 
  test_emissions_rate_dt[
    model == 'rate' & degree == 'linear',
    .(orispl_code, unitid, pollutant, term, estimate)
  ] |>
  dcast(
    orispl_code + unitid + pollutant~ term,
    value.var = 'estimate'
  ) |> 
  setnames(
    old = c('(Intercept)','gload_mw_tot'),
    new = c('intercept','coef')
  ) %>% 
  .[,zero := -intercept/coef] |>
  merge(
    elec_gen_dt[,.(max_gload_mw_tot = max(gload_mw_tot)), keyby = .(orispl_code, unitid)],
    by = c('orispl_code','unitid')
  )

linear_rate_dt[,.(
  pct_positive_intercept = mean(intercept > 0, na.rm = TRUE),
  pct_increasing_emissions_rate = mean(coef > 0, na.rm = TRUE),
  pct_max_obs_below_model_zero = mean(zero > max_gload_mw_tot, na.rm = TRUE)),
  keyby = .(pollutant)
]


quadratic_rate_dt = 
  test_emissions_rate_dt[
    model == 'rate' & degree == 'quadratic',
    .(orispl_code, unitid, pollutant, term, estimate)
  ] |>
  dcast(
    orispl_code + unitid + pollutant~ term,
    value.var = 'estimate'
  ) |> 
  setnames(
    old = c('(Intercept)','gload_mw_tot','I(gload_mw_tot^2)'),
    new = c('intercept','coef','coef_sq')
  ) |>
  merge(
    elec_gen_dt[,.(
      max_gload_mw_tot = max(gload_mw_tot),
      sum_gload_mw_tot = sum(gload_mw_tot)), 
      keyby = .(orispl_code, unitid)],
    by = c('orispl_code','unitid')
  ) %>% 
  .[,':='(
    min = intercept + coef*(-coef/(2*coef_sq)) + coef_sq*(-coef/(2*coef_sq))^2,
    est_at_max = intercept + coef*max_gload_mw_tot + coef_sq*max_gload_mw_tot^2
  )] 

quadratic_rate_dt[,.(
  pct_positive_intercept = mean(intercept > 0, na.rm = TRUE),
  pct_positive_min = mean(min > 0, na.rm = TRUE),
  pct_est_at_max_positive = mean(est_at_max > 0, na.rm = TRUE),
  pct_all_positive = mean(intercept > 0 & min > 0 & est_at_max> 0, na.rm = TRUE),
  pct_all_positive_wt = weighted.mean(intercept > 0 & min > 0 & est_at_max> 0, w = sum_gload_mw_tot,na.rm = TRUE),
  pct_convex = mean(coef_sq < 0, na.rm = TRUE)),
  keyby = .(pollutant)
]

merge(
  CJ(
    pollutant = c('co2','nox','so2'),
    gload = seq(0, 1000, by = 5)
  ),
  quadratic_rate_dt[intercept > 0 & min > 0 & est_at_max> 0], 
  #quadratic_rate_dt[orispl_code == '2632' & unitid == '1'],
  by = 'pollutant',
  allow.cartesian = TRUE
)[max_gload_mw_tot >= gload, .(
  orispl_code, unitid, pollutant,
  gload,
  pred_emissions = intercept + coef*gload + coef_sq*gload^2
)] |>
  ggplot(aes(x = gload, y = pred_emissions)) + 
    geom_line(aes(group = paste(orispl_code,unitid)),alpha = 0.2) + 
    geom_smooth() +
    facet_wrap(~pollutant, scales = 'free')




# Look at residuals from running the regression 
test_emissions_rate_resid = function(orispl_code_in, unitid_in){
  print(paste(orispl_code_in, unitid_in))
  # Filtering to single unit
  test_dt = cems_dt[
    orispl_code == orispl_code_in 
    & unitid == unitid_in
    & !is.na(gload_mw)
    & !is.na(nox_mass_lbs)
    & !is.na(so2_mass_lbs)
    & !is.na(co2_mass_tons)
  ]
  
  # calculating the 75th percentile of generation 
  test_dt[, 
    gload_over_p75 := gload_mw >= quantile(test_dt$gload_mw, probs = 0.75, na.rm =TRUE)
      & gload_mw > min(test_dt$gload_mw, na.rm =TRUE)
  ]
  
  # Running the regression
  test_mod = feols(
    data = test_dt, 
    fml = c(nox_mass_lbs, so2_mass_lbs, co2_mass_tons) ~ 
      -1 + gload_mw
  )
  
  test_out = 
    cbind(
      test_dt[,.(orispl_code, unitid, gload_mw, nox_mass_lbs, so2_mass_lbs, co2_mass_tons)],
      data.table(
        nox = residuals(test_mod[lhs = 'nox']),
        so2 = residuals(test_mod[lhs = 'so2']),
        co2 = residuals(test_mod[lhs = 'co2'])
      )
    ) |>
    melt(
      id.vars = c('orispl_code', 'unitid', 'gload_mw', 'nox_mass_lbs', 'so2_mass_lbs', 'co2_mass_tons'),
      variable.name = 'pollutant',
      value.name = 'residual'
    )
  
  test_out = 
    rbind(
      test_mod[lhs = 'nox'] |> 
        tidy() |> 
        mutate(
          pollutant = 'nox',
          orispl_code = orispl_code_in,
          unitid = unitid_in
        ),
      test_mod[lhs = 'so2'] |> 
        tidy() |> 
        mutate(
          pollutant = 'so2',
          orispl_code = orispl_code_in,
          unitid = unitid_in
        ),
      test_mod[lhs = 'co2'] |> 
        tidy() |> 
        mutate(
          pollutant = 'co2',
          orispl_code = orispl_code_in,
          unitid = unitid_in
        )
    ) |> data.table()
  
  
  return(test_out)
}

theme_set(theme_minimal())
ggplot(test_out, aes(x = gload_mw, y = residual)) + 
  geom_point() +
  geom_smooth() +
  facet_wrap(~pollutant, scales = 'free')

resid_dt = 
  merge(
    cems_dt, 
    emissions_dt, 
    by = c('orispl_code','unitid')
  )[,':='(
    pred_nox = nox_lbs_per_mw*gload_mw,
    resid_nox = nox_mass_lbs - nox_lbs_per_mw*gload_mw,
    pred_so2 = so2_lbs_per_mw*gload_mw,
    resid_so2 = so2_mass_lbs - so2_lbs_per_mw*gload_mw,
    pred_co2 = co2_tons_per_mw*gload_mw,
    resid_co2 = co2_mass_tons - co2_tons_per_mw*gload_mw
  )]

ggplot(resid_dt[!is.na(gload_mw)], aes(x = gload_mw, y = resid_nox)) + 
  geom_point(alpha = 0.01)


